Introduction

The aim is to explore the annotations and label transfers for all of the samples in SCPCP000006.

In order to explore the label transfer results, we look into some marker genes, table and percentages of cells in each annotation groups (from label transfers).

We also looked into the predicted.score for each of the compartments and explored the potential effects of score thresholds. In this notebook, we fixed the predicted.score threshold to 0.85.

Packages

library(Seurat)
library(tidyverse)
library(tidyr)
library(patchwork)
library(DT)

Base directories

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")

result_dir <- file.path(module_base, "results")

Input files

In this notebook, we are working with all of the samples in SCPCP000006.

The sample metadata can be found in sample_metadata_file in the data folder.

We extracted from the pre-processed and labeled Seurat object (that is the output of 02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved in the results directory) the following information per cell:

  • the predicted compartment in fetal_kidney_predicted.compartment

  • the predicted organ in fetal_full_predicted.organ

  • the sample identifier in sample_id

  • the countsof the endothelial marker VWF = “ENSG00000110799”

  • the countsof the immune marker PTPRC = “ENSG00000081237”

sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")

metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE) |>
  dplyr::filter(seq_unit != "spot") 


# Create a data frames of all annotations
cell_type_df <- metadata$scpca_sample_id |>
  purrr::map(
    # For each sample_id, do the following:
    \(sample_id) {
      # Read in the Seurat object
      srat <- readRDS(
        file.path(result_dir, sample_id, paste0("02b-fetal_kidney_label-transfer_", sample_id, ".Rds"))
      )

      # Create and return a data frame from the Seurat object with relevant annotations
      # this data frame will have six columns: barcode, compartment, organ, compartment_score, VWF and PTPRC
      data.frame(
        compartment = srat$fetal_kidney_predicted.compartment,
        compartment_score = srat$fetal_kidney_predicted.compartment.score,
        organ = srat$fetal_full_predicted.organ,
        PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
        VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
        umap = srat@reductions$umap@cell.embeddings
      ) |>
        tibble::rownames_to_column("barcode") |>
        dplyr::mutate(sample_id = sample_id)
    }
  ) |>
  # now combine all dataframes to make one big one
  dplyr::bind_rows() |>
  # Combine with metadata
  dplyr::left_join(metadata, by = c("sample_id" = "scpca_sample_id")) |>
  # Create column for whether the compartment score passes threshold
  dplyr::mutate(pass_mapping_QC = compartment_score > params$predicted.score_thr)


# we create a data frames of annotation of cells that pass the `predicted.score` threshold
cell_type_df_pass <- cell_type_df |>
  dplyr::filter(pass_mapping_QC)

# we create a data frames of annotation of cells that don't pass the `predicted.score` threshold
cell_type_df_nopass <- cell_type_df |>
  dplyr::filter(!pass_mapping_QC)

Output file

The report will be saved in the notebook directory.

Functions

do_Feature_mean

do_Feature_mean shows heatmap of mean expression of a feature grouped by a metadata.

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • group.by is the name of the metadata to group the cells
  • feature is the name of the gene to average the expression
do_Feature_mean <- function(df, group.by, feature) {
  df <- df %>%
    group_by(sample_id, !!sym(group.by)) %>%
    summarise(m = mean(!!sym(feature)))


  p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
    geom_tile() +
    scale_fill_viridis_c() +
    theme_bw() +
    theme(text = element_text(size = 20)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5), title = element_text(size = rel(0.75))) +
    guides(fill = guide_colourbar(title = paste0(feature)))

  return(p)
}

do_Feature_boxplot

do_Feature_boxplot shows boxplot of expression of a feature grouped by a metadata.

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • feature is the name of the gene to average the expression
  • group.by is the name of the metadata to group the cells
  • split.by is the name of the metadata to split the plots
do_Feature_boxplot <- function(df, group.by, feature, split.by) {
  df <- df %>%
    mutate(!!sym(group.by), group = factor(!!sym(group.by), levels = c("fetal_nephron", "stroma", "immune", "endothelium")))

  p <- ggplot(
    df,
    aes(x = group, y = !!sym(feature), fill = group)
  ) +
    geom_boxplot(size = 0.5, size.outlier = 0.25) +
    facet_wrap(vars(!!sym(split.by)), scale = "free_y", ncol = 4) +
    theme_bw() +
    theme(
      axis.text.x = element_text(size = 12, angle = 0, hjust = 0.5, vjust = 0),
      legend.position = "none"
    )

  return(p)
}

do_Feature_densityplot

do_Feature_densityplot shows boxplot of expression of a feature grouped by a metadata.

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • group.by is the name of the metadata to group the cells
  • feature is the name of the gene to average the expression
do_Feature_densityplot <- function(df, group.by, feature) {
  df <- df %>%
    mutate(!!sym(group.by), group = factor(!!sym(group.by), levels = c("fetal_nephron", "stroma", "immune", "endothelium")))

  p <- ggplot(
    df,
    aes(x = !!sym(feature), fill = group)
  ) +
    geom_density() +
    facet_wrap(vars(!!sym(group.by)), scale = "free_y", ncol = 4) +
    theme_bw()

  return(p)
}

Analysis

Total number of cells per samples and predicted organ

The predicted organ comes from label transfer from the human fetal atlas. This is the reference developed by Cao et al. provided by Azimuth for label transfer. The reference contain cells from 15 organs including kidney from fetal samples. The label transfer have been performed in the notebook 02a_fetal_full_reference_Cao_{sample_id}.Rmd

This notebook looks at the fetal_full_predicted.organ annotation from this reference.

Here, we are summarizing per sample:

  • the percentage of cells labeled as kidney, after label transfer from the fetal full reference Cao et al.,

  • the percentage of cells not labeled as kidney, after label transfer from the fetal full reference Cao et al.,

  • the total number of cells per sample.

kidney_count_df <- cell_type_df |>
  # make a new variable that tells us if it's kidney or not
  dplyr::mutate(organ_type = ifelse(organ == "Kidney", "kidney", "not_kidney")) |>
  # count how many kidney or not
  dplyr::count(sample_id, organ_type) |>
  # make the data frame wide and use a value of 0 if a count is missing
  tidyr::pivot_wider(names_from = organ_type, values_from = n, values_fill = 0) |>
  # add a column for total, and convert other counts to percentages
  dplyr::mutate(
    total = kidney + not_kidney,
    kidney = round(kidney / total, 4) * 100,
    not_kidney = round(not_kidney / total, 4) * 100
  ) |>
  # arrange in descending order of percentage of kidney cells
  dplyr::arrange(desc(kidney))

DT::datatable(kidney_count_df,
  caption = "percentage of cells mapping to the predicted organ kidney",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("csv", "excel")
  )
)

Predicted compartment

The predicted compartment is the result of the label transfer fron the human fetal kidney atlas: Stewart et al. created a human fetal kidney atlas. This reference contains only fetal kidney cells and has been precisely annotated by kidney experts. The label transfer have been performed in the notebook 02b_fetal_kidney_reference_Stewart_{sample_id}.Rmd

The fetal kidney reference (Stewart et al.) provides two levels of annotations:

  • fetal_kidney_predicted.compartment is one of the 4 main compartments composing a fetal kidney. We expect immune and endothelial cells to be healthy (non-canerous) cells identified easily with high confidency. We expect stroma and fetal nephron compartment to contain both normal and malignant cells.

    • immune cells

    • stroma cells

    • fetal_nephron cells

    • endothelial cells

For each compartment, we sumarized the percentage of cells that do match kidney annotation or not. Please note that this table is not sample-specific but contains all samples pooled into one.

compartment_count_df <- cell_type_df |>
  # make a new variable that tells us if it's kidney or not
  dplyr::mutate(organ_type = ifelse(organ == "Kidney", "kidney", "not_kidney")) |>
  # count how many kidney or not
  dplyr::count(compartment, organ_type) |>
  # make the data frame wide and use a value of 0 if a count is missing
  tidyr::pivot_wider(names_from = organ_type, values_from = n, values_fill = 0) |>
  # add a column for total, and convert other counts to percentages
  dplyr::mutate(
    total = kidney + not_kidney,
    kidney = round(kidney / total, 4) * 100,
    not_kidney = round(not_kidney / total, 4) * 100
  ) |>
  # arrange in descending order of percentage of kidney cells
  dplyr::arrange(desc(kidney))

DT::datatable(compartment_count_df,
  caption = "percentage of cells mapping to the predicted organ kidney",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("csv", "excel")
  )
)

What is the predicted organ of cells that are not labeled as kidney cell? Please note that this table is not sample-specific but contains all samples pooled into one.

compartment_df <- cell_type_df |>
  # count how many per compartment
  dplyr::count(organ, compartment) |>
  # make the table wide and use a value of 0 if a count is missing
  tidyr::pivot_wider(names_from = compartment, values_from = n, values_fill = 0) |>
  # order the columns how we want to show them
  dplyr::select(organ, fetal_nephron, stroma, endothelium, immune)

DT::datatable(compartment_df,
  caption = "counts of cells in each compartment",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("csv", "excel")
  )
)

We also checked the number of cell in each compartment per sample, to assess the presence/absence of non-cancer cells (endothelia and immune) that could help the inference of copy number alterations.

compartment_df <- cell_type_df |>
  # count how many per compartment
  dplyr::count(sample_id, compartment) |>
  # make the table wide and use a value of 0 if a count is missing
  tidyr::pivot_wider(names_from = compartment, values_from = n, values_fill = 0) |>
  # order the columns how we want to show them
  dplyr::select(sample_id, fetal_nephron, stroma, endothelium, immune)

DT::datatable(compartment_df,
  caption = "counts of cells in each compartment",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("csv", "excel")
  )
)

Label transfer predicted.score for the four compartments

The vertical line drawn correspods to the threshold explored in the notebook.

p <- do_Feature_densityplot(
  df = cell_type_df,
  feature = "compartment_score",
  group.by = "compartment"
)
p + geom_vline(xintercept = params$predicted.score_thr)

Some cells have an excellent predicted.score (> 0.85) while other performed worse. Let’s have a look at the compartments per patient.

p <- do_Feature_boxplot(
  df = cell_type_df,
  feature = "compartment_score",
  group.by = "compartment",
  split.by = "sample_id"
)

p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))

Regarding the endothelial cells, some patient have very high predicted.score while others perform poorly. Let’s have a look if this can be linked to some metadata/clinical data.

Treatment
p <- do_Feature_boxplot(
  df = cell_type_df,
  feature = "compartment_score",
  group.by = "compartment",
  split.by = "treatment"
)

p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))

Subdiagnosis
p <- do_Feature_boxplot(
  df = cell_type_df,
  feature = "compartment_score",
  group.by = "compartment",
  split.by = "subdiagnosis"
)

p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))

Disease timing
p <- do_Feature_boxplot(
  df = cell_type_df,
  feature = "compartment_score",
  group.by = "compartment",
  split.by = "disease_timing"
)

p + geom_hline(yintercept = params$predicted.score_thr) + coord_cartesian(ylim = c(0.25, 1))

We do not really see a pattern linking the label transfer score and some meta/clinical data.

We finally look for each patient and compartment, the number of cells that have a predicted.score > 0.85 (TRUE).

scores_df <- cell_type_df |>
  # count how many per compartment
  dplyr::count(sample_id, compartment, pass_mapping_QC) |>
  # make the table wide and use a value of 0 if a count is missing
  tidyr::pivot_wider(names_from = pass_mapping_QC, values_from = n, values_fill = 0)


DT::datatable(scores_df,
  caption = "counts of cells that have a good/poor mapping in each compartment",
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("csv", "excel")
  )
)

Evaluate annotations with marker genes

Here we evaluate with marker genes the identification of endothelial, immune cells sub-populations. We expect that cells labeled as endothelial and immune cells will have higher expression of these markers compared to other cell types.

Endothelial cells

We look at the endothelial marker “ENSG00000110799” = “VWF”

do_Feature_mean(
  df = cell_type_df,
  feature = "ENSG00000110799",
  group.by = "compartment"
) +
  ggtitle("VWF expression averaged by compartment for all cells")

do_Feature_mean(
  df = cell_type_df_pass,
  feature = "ENSG00000110799",
  group.by = "compartment"
) +
  ggtitle("VWF expression averaged by compartment for cells passing the `predicted.score` threshold")

do_Feature_mean(
  df = cell_type_df_nopass,
  feature = "ENSG00000110799",
  group.by = "compartment"
) +
  ggtitle("VWF expression averaged by compartment for cells that don't pass the `predicted.score` threshold")

do_Feature_boxplot(
  df = cell_type_df,
  feature = "ENSG00000110799",
  group.by = "compartment",
  split.by = "sample_id"
) +
  ggtitle("boxplot of VWF expression for all cells")

Immune cells

We look at the immune marker “ENSG00000081237” = “PTPRC” alias “CD45”

do_Feature_mean(
  df = cell_type_df,
  feature = "ENSG00000081237",
  group.by = "compartment"
) +
  ggtitle("PTPRC expression averaged by compartment for all cells")

do_Feature_mean(
  df = cell_type_df_pass,
  feature = "ENSG00000081237",
  group.by = "compartment"
) +
  ggtitle("PTPRC expression averaged by compartment for cells passing the `predicted.score` threshold")

do_Feature_mean(
  df = cell_type_df_nopass,
  feature = "ENSG00000081237",
  group.by = "compartment"
) +
  ggtitle("PTPRC expression averaged by compartment for cells that are not passing the predicted score threshold")

do_Feature_boxplot(
  df = cell_type_df,
  feature = "ENSG00000081237",
  group.by = "compartment",
  split.by = "sample_id"
) +
  ggtitle("boxplot of PTPRC expression for all cells")

Umap reduction

We look at the umap reduction per sample.

All cells

Point are colored per compartment:

  • red = endothelial

  • green = fetal nephron

  • cyan = immune

  • purple = stroma

Black cross show cell that do not pass the predicted.score threshold.

ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = compartment), shape = 19, size = 2) +
  geom_point() +
  geom_point(data = cell_type_df_nopass, mapping = aes(x = umap.umap_1, y = umap.umap_2), shape = 4, color = "black", alpha = 0.3) +
  facet_wrap(facets = ~sample_id)

Cells passing the predicted.score threshold

Point are colored per compartment. Here we only plotted cells that do pass the predicted.score threshold.

ggplot(cell_type_df_pass, aes(x = umap.umap_1, y = umap.umap_2, color = compartment), shape = 19, size = 2) +
  geom_point() +
  facet_wrap(facets = ~sample_id)

Conclusion

Assessment of the quality of label transfer

Looking at the predicted.organ label (from the human fetal atlas, Cao et al.), we can be confident about the label transfer.

The majority of fetal nephron cell (92%) has been predicted as kidney. The fact that the other compartments (i.e. stroma, endothelial and immune) do not really match to kidney cells (less than one third predicted as kidney) shouldn’t be a concern and and shouldn’t be interpreted as a poor label transfer or annotation.

It is indeed known that Wilms tumor stroma (sometimes) shows (unexpected) differentiation into cell types such as skeletal muscle cells, fat tissue, cartilage, bone and even glial cells [1-2]. For that reason, we are not surprised that most stroma cells are not predicted as kidney cells.

Regarding the immune compartment, it is not of a surprise that cancer cells, and/or treatment, modulate the immune microenvironment, via the attraction of immune cells that are not usually in the kidney and/or induction of a cancer-associated phenotype. For that reason, we are not surprised that immune cells can be resembling immune cells from other organs.

Next step

The next step of the analysis is to test whether copyKAT can help us distinguishing cancer cells from non-malignant cell types. copyKAT (Copynumber Karyotyping of Tumors) is a computational tool using integrative Bayesian approaches to identify genome-wide aneuploidy in single cells to separate tumor cells from normal cells, using high-throughput sc-RNAseq data. The underlying logic for calculating DNA copy number events from RNAseq data is that gene expression levels of many adjacent genes can provide depth information to infer genomic copy number in that region.

Of note, it is known that CopyKAT has difficulty in predicting tumor and normal cells in the cases of pediatric and liquid tumors that have a few CNAs, such as Wilms tumors. CopyKAT provides two ways to bypass this:

  • by providing a vector of cell names of known normal cells from the same dataset

  • or by try to search for T cells.

For that reason, we will first try to run CopyKAT on 2-5 samples that have a majority of cells mapping to kidney and a decent amount (>100 cells when possible) of normal cells (endothelial and immune cells) and for which we are quite confident with the label transfer:

  • sample SCPCS000194

  • sample SCPCS000179

  • sample SCPCS000184

  • sample SCPCS000205

  • sample SCPCS0000208

For the selected samples, we will run copyKAT with and without providing a list of normal cells. That will be useful to us to see how consistent results are. Also it will inform us about how we might analyze samples that aren’t predicted to have that many (or sometimes any at all!) endothelial or immune cells.

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] DT_0.33            patchwork_1.2.0    lubridate_1.9.3    forcats_1.0.0     
##  [5] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
##  [9] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
## [13] Seurat_5.1.0       SeuratObject_5.0.2 sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     jsonlite_1.8.8         magrittr_2.0.3        
##   [4] spatstat.utils_3.1-0   farver_2.1.2           rmarkdown_2.28        
##   [7] vctrs_0.6.5            ROCR_1.0-11            spatstat.explore_3.3-2
##  [10] htmltools_0.5.8.1      sass_0.4.9             sctransform_0.4.1     
##  [13] parallelly_1.38.0      KernSmooth_2.23-24     bslib_0.8.0           
##  [16] htmlwidgets_1.6.4      ica_1.0-3              plyr_1.8.9            
##  [19] plotly_4.10.4          zoo_1.8-12             cachem_1.1.0          
##  [22] igraph_2.0.3           mime_0.12              lifecycle_1.0.4       
##  [25] pkgconfig_2.0.3        Matrix_1.7-0           R6_2.5.1              
##  [28] fastmap_1.2.0          fitdistrplus_1.2-1     future_1.34.0         
##  [31] shiny_1.9.1            digest_0.6.37          colorspace_2.1-1      
##  [34] rprojroot_2.0.4        tensor_1.5             RSpectra_0.16-2       
##  [37] irlba_2.3.5.1          crosstalk_1.2.1        labeling_0.4.3        
##  [40] progressr_0.14.0       fansi_1.0.6            spatstat.sparse_3.1-0 
##  [43] timechange_0.3.0       httr_1.4.7             polyclip_1.10-7       
##  [46] abind_1.4-5            compiler_4.4.1         withr_3.0.1           
##  [49] fastDummies_1.7.4      highr_0.11             MASS_7.3-61           
##  [52] tools_4.4.1            lmtest_0.9-40          httpuv_1.6.15         
##  [55] future.apply_1.11.2    goftest_1.2-3          glue_1.7.0            
##  [58] nlme_3.1-166           promises_1.3.0         grid_4.4.1            
##  [61] Rtsne_0.17             cluster_2.1.6          reshape2_1.4.4        
##  [64] generics_0.1.3         gtable_0.3.5           spatstat.data_3.1-2   
##  [67] tzdb_0.4.0             data.table_1.16.0      hms_1.1.3             
##  [70] utf8_1.2.4             spatstat.geom_3.3-2    RcppAnnoy_0.0.22      
##  [73] ggrepel_0.9.5          RANN_2.6.2             pillar_1.9.0          
##  [76] spam_2.10-0            RcppHNSW_0.6.0         later_1.3.2           
##  [79] splines_4.4.1          lattice_0.22-6         renv_1.0.7            
##  [82] survival_3.7-0         deldir_2.0-4           tidyselect_1.2.1      
##  [85] miniUI_0.1.1.1         pbapply_1.7-2          knitr_1.48            
##  [88] gridExtra_2.3          scattermore_1.2        xfun_0.47             
##  [91] matrixStats_1.3.0      stringi_1.8.4          lazyeval_0.2.2        
##  [94] yaml_2.3.10            evaluate_0.24.0        codetools_0.2-20      
##  [97] BiocManager_1.30.25    cli_3.6.3              uwot_0.2.2            
## [100] xtable_1.8-4           reticulate_1.38.0      munsell_0.5.1         
## [103] jquerylib_0.1.4        Rcpp_1.0.13            globals_0.16.3        
## [106] spatstat.random_3.3-1  png_0.1-8              spatstat.univar_3.0-0 
## [109] parallel_4.4.1         dotCall64_1.1-1        listenv_0.9.1         
## [112] viridisLite_0.4.2      scales_1.3.0           ggridges_0.5.6        
## [115] leiden_0.4.3.1         rlang_1.1.4            cowplot_1.1.3